Aim
The aim of this project is to find what most influences the price of a diamond, and to be able to predict the price of an unseen data set based on the findings.
Load Libraries
library(pROC)
#Displays and analyses ROC curves
library(car)
#A companion to applied regression
library(caTools)
#Moving window statistics, GIFs, and other tools
library(ggplot2)
#Higher level graphing package
library(plotly)
library(ggcorrplot)
library(tidyverse)
Data Sourcing and cleanup
##Import packages required
###install.packages("ggcorrplot")
library(ggcorrplot)
library(tidyverse)
##Looking to load the data set Diamonds (We lacked the data for a business relevant one.)
## Load the pre-cleaned diamonds data set.
## NOTE(review): hard-coded absolute network path — the script only runs on this
## machine/share. Consider a relative path (or file.path()) to make it portable.
Diamonds <- read.csv("S:/Business Solutions/ThomasT/Current/data-fellowship/Regression/Diamonds.csv")
##Data cleanup should occur now, however this data has been pre-cleaned to speed up the process.
####Would look for Null or incorrect-format values that didn't fit and remove the rows/separate them from the data.
Using a correlogram to identify correlations with price.
##The aim of this part is to use a correlogram of the entire dataset to work out what attribute most affects the price of a diamond. Will initially have to alter the categoric values to numeric ones.
## Recode the categorical columns as integer codes so cor() can include them.
## Since R 4.0 read.csv() keeps strings as character (stringsAsFactors = FALSE),
## and as.integer() on a character column produces NAs with a warning — so
## coerce through factor() first to get stable integer level codes.
Diamonds$cut <- as.integer(as.factor(Diamonds$cut))
Diamonds$color <- as.integer(as.factor(Diamonds$color))
Diamonds$clarity <- as.integer(as.factor(Diamonds$clarity))
##Find Correlation between values
cor(Diamonds)
carat cut color clarity depth table price
carat 1.00000000 0.0171237362 0.2914367543 -0.21429037 0.02822431 0.18161755 0.92159130
cut 0.01712374 1.0000000000 0.0003042479 0.02823537 -0.19424856 0.15032703 0.03986029
color 0.29143675 0.0003042479 1.0000000000 -0.02779550 0.04727923 0.02646520 0.17251093
clarity -0.21429037 0.0282353656 -0.0277954960 1.00000000 -0.05308011 -0.08822266 -0.07153497
depth 0.02822431 -0.1942485626 0.0472792348 -0.05308011 1.00000000 -0.29577852 -0.01064740
table 0.18161755 0.1503270263 0.0264652011 -0.08822266 -0.29577852 1.00000000 0.12713390
price 0.92159130 0.0398602909 0.1725109282 -0.07153497 -0.01064740 0.12713390 1.00000000
x 0.97509423 0.0223419276 0.2702866854 -0.22572144 -0.02528925 0.19534428 0.88443516
y 0.95172220 0.0275720250 0.2635844027 -0.21761579 -0.02934067 0.18376015 0.86542090
z 0.95338738 0.0020373568 0.2682268757 -0.22426307 0.09492388 0.15092869 0.86124944
x y z
carat 0.97509423 0.95172220 0.953387381
cut 0.02234193 0.02757203 0.002037357
color 0.27028669 0.26358440 0.268226876
clarity -0.22572144 -0.21761579 -0.224263069
depth -0.02528925 -0.02934067 0.094923882
table 0.19534428 0.18376015 0.150928692
price 0.88443516 0.86542090 0.861249444
x 1.00000000 0.97470148 0.970771799
y 0.97470148 1.00000000 0.952005716
z 0.97077180 0.95200572 1.000000000
##9 Decimal places is not ideal, drop it to 2.
## Round the correlation matrix to 2 d.p. for readable labels, then draw the
## correlogram with hierarchical clustering so related attributes sit together.
corr_2dp <- round(cor(Diamonds), digits = 2)
ggcorrplot(corr_2dp, method = "circle", lab = TRUE, hc.order = TRUE)

###From this, the attribute best correlated with price appears to be carat, so that is what I'll use.
Split the data into train and test sets
##In order to check if the prediction works, we need to split the data into training data and test data. I'll split it about 75-25 to give a good amount to work with.
## Reproducible 75/25 train/test split.
## caTools::sample.split() expects the outcome VECTOR, not a data frame —
## passing the data frame builds a mask from the number of columns, which is
## then silently recycled over the rows. Splitting on price gives one logical
## per row and keeps the price distribution balanced across the two sets.
set.seed(123)
split <- sample.split(Diamonds$price, SplitRatio = 0.75)
train <- subset(Diamonds, split == TRUE)
test <- subset(Diamonds, split == FALSE)
Analyse the curve to best begin the linear prediction model
## Scatter carat (x) against price (y) — same orientation as the model
## price ~ carat, so the fitted line can be overlaid on this plot.
## (The original plotted price on x and carat on y, then called
## curve(add = TRUE) AFTER plot(linearModel), which drew the fit on top of
## the last lm diagnostic plot rather than the data scatter.)
plot(train$carat, train$price,
     xlab = "Carat", ylab = "Price")

## Simple linear model: price predicted from carat, fit on the training set.
linearModel <- lm(price ~ carat, data = train)

## Overlay the fitted line on the scatterplot while it is the active device.
curve(predict(linearModel, data.frame(carat = x)),
      lwd = 2, col = "dodgerblue", add = TRUE)

## Standard lm diagnostics (residuals vs fitted, Q-Q, scale-location, leverage).
plot(linearModel)

Plot secondary graph to visualise prediction vs actual value
## Score the hold-out set with the linear model, then plot predicted vs
## actual price; a perfect model would sit on the diagonal.
test$predicted.linear <- predict(linearModel, test)

pl1 <- ggplot(test, aes(x = price, y = predicted.linear)) +
  geom_point(alpha = 0.5) +
  stat_smooth() +
  labs(x = "Actual value of diamond",
       y = "Predicted value of diamond") +
  theme_bw()

## Interactive version of the same plot.
ggplotly(pl1)
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Find Error based on linear model
## Root-mean-square error of the linear model on the test set
## (same units as price, so directly interpretable).
squared_err <- (test$price - test$predicted.linear)^2
rmse <- sqrt(mean(squared_err))
rmse
[1] 1527.588
Plot both non-linear and linear models.
## Fit an exponential model price ~ exp(b * carat) + offset alongside the
## linear model, then overlay both fits on the training scatterplot.
linearModel <- lm(price ~ carat, data = train)

## Earlier attempts kept for posterity.
##nonLinearModel <- nls(price ~ 327.7013 + 804.2829*e^(1.450356*carat), train)
##nonLinearModel <- Vectorize(nls(price ~ (327.7013 + 804.2829*e^(+1.450356*x)), train, start = list(x = carat)))
##curve((327.7013 + 804.2829*e^(+1.450356*x)),lwd=3, col="dodgerblue", add = T)

## exp() replaces the hand-rolled e <- exp(1); e^(...) form, and the intercept
## parameter is named `offset` instead of `c`, which shadowed base::c().
## Same formula and starting values, so the fit is unchanged.
nonLinearModel <- nls(price ~ exp(b * carat) + offset,
                      data = train,
                      start = list(b = 2, offset = 0))

plot(train$carat, train$price, col = "red", pch = 16)
curve(predict(linearModel, data.frame(carat = x)),
      lwd = 2, col = "dodgerblue", add = TRUE)
curve(predict(nonLinearModel, data.frame(carat = x)),
      lwd = 3, col = "black", add = TRUE)

## Score the hold-out set with the non-linear model and plot predicted vs
## actual price, mirroring the linear-model plot above.
test$predicted.nonlinear <- predict(nonLinearModel, test)

pl1 <- ggplot(test, aes(x = price, y = predicted.nonlinear)) +
  geom_point(alpha = 0.5) +
  stat_smooth() +
  labs(x = "Actual value of price",
       y = "Predicted value of price") +
  theme_void()

## Interactive version of the same plot.
ggplotly(pl1)
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Again, like we did with the linear model, we can calculate the RMSE. Then we can compare that with the RMSE from the linear model to see if any improvements have been made.
## Root-mean-square error of the non-linear model on the test set,
## for direct comparison with the linear model's rmse.
residual <- test$price - test$predicted.nonlinear
rmseNL <- sqrt(mean(residual^2))
rmseNL
[1] 3973.389
From this, we can see the linear model is more accurate than the non-linear one. I would argue however that the non-linear custom equation should theoretically be more accurate, yet I cannot figure out how to make that work. That will be a point for a future project. This can therefore be used on other data sets of similar making to attempt to estimate Diamond prices based on Carat.
LS0tDQp0aXRsZTogIkFjdHMgb2YgUmVncmVzc2lvbiBQcm9qZWN0Ig0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCiNBaW0NClRoZSBhaW0gb2YgdGhpcyBwcm9qZWN0IGlzIHRvIGZpbmQgd2hhdCBtb3N0IGluZmx1ZW5jZXMgdGhlIHByaWNlIG9mIGEgZGlhbW9uZCwgYW5kIHRvIGJlIGFibGUgdG8gcHJlZGljdCB0aGUgcHJpY2Ugb2YgYW4gdW5zZWVuIGRhdGEgc2V0IGJhc2VkIG9mZiB0aGUgZmluZGluZ3MuDQoNCiNMb2FkIExpYnJhcmllcw0KYGBge3J9DQpsaWJyYXJ5KHBST0MpDQojRGlzcGxheXMgYW5kIGFuYWx5c2VzIFJPQyBjdXJ2ZXMNCg0KbGlicmFyeShjYXIpDQojQSBjb21wYW5pb24gdG8gYXBwbGllZCByZWdyZXNzaW9uDQoNCmxpYnJhcnkoY2FUb29scykNCiNNb3Zpbmcgd2luZG93IHN0YXRpc3RpY3MsIEdJRnMsIGFuZCBvdGhlciB0b29scw0KDQpsaWJyYXJ5KGdncGxvdDIpDQojSGlnaGVyIGxldmVsIGdyYXBoaW5nIHBhY2thZ2UNCg0KbGlicmFyeShwbG90bHkpDQoNCmxpYnJhcnkoZ2djb3JycGxvdCkNCg0KbGlicmFyeSh0aWR5dmVyc2UpDQpgYGANCg0KDQojRGF0YSBTb3VyY2luZyBhbmQgY2xlYW51cA0KYGBge3J9DQojI0ltcG9ydCBwYWNrYWdlcyByZXF1aXJlZA0KIyMjaW5zdGFsbC5wYWNrYWdlcygiZ2djb3JycGxvdCIpDQpsaWJyYXJ5KGdnY29ycnBsb3QpDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCg0KIyNMb29raW5nIHRvIGxvYWQgdGhlIGRhdGEgc2V0IERpYW1vbmRzIChXZSBsYWNrZWQgdGhlIGRhdGEgZm9yIGEgYnVzaW5lc3MgcmVsZXZlbnQgb25lLikNCg0KRGlhbW9uZHMgPC0gcmVhZC5jc3YoIlM6L0J1c2luZXNzIFNvbHV0aW9ucy9UaG9tYXNUL0N1cnJlbnQvZGF0YS1mZWxsb3dzaGlwL1JlZ3Jlc3Npb24vRGlhbW9uZHMuY3N2IikNCg0KIyNEYXRhIGNsZWFudXAgc2hvdWxkIG9jY3VyIG5vdywgaG93ZXZlciB0aGlzIGRhdGEgaGFzIGJlZW4gcHJlLWNsZWFuZWQgdG8gc3BlZWQgdXAgdGhlIHByb2Nlc3MuDQojIyMjV291bGQgbG9vayBmb3IgTnVsbCBvciBpbmNvcnJlY3QgZm9ybWF0IHZhbHVlcyB0aGF0IGRpZG50IGZpdCBhbmQgcmVtb3ZlIHRoZSByb3dzL3NlcGFyYXRlIGZyb20gdGhlIGRhdGEuDQoNCg0KYGBgDQoNCg0KI1VzaW5nIGEgQ29ycmVsb2dyYW1zIHRvIGlkZW50aWZ5IGNvcnJlbGF0aW9ucyB3aXRoIHByaWNlLg0KYGBge3J9DQojI1RoZSBhaW0gb2YgdGhpcyBwYXJ0IGlzIHRvIHVzZSBhIENvcnJlbG9ncmFtcyBvZiB0aGUgZW50aXJlIGRhdGFzZXQgdG8gd29yayBvdXQgd2hhdCBhdHRyaWJ1dGUgbW9zdCBhZmZlY3RzIHRoZSBwcmljZSBvZiBhIGRpYW1vbmQuIFdpbGwgaW5pdGlhbGx5IGhhdmUgdG8gYWx0ZXIgdGhlIGNhdGVnb3JpYyB2YWx1ZXMgdG8gYSBudW1lcmljIG9uZS4NCg0KRGlhbW9uZHMkY3V0IDwtIGFzLmludGVnZXIoRGlhbW9uZHMkY3V0KQ0KRGlhbW9uZHMkY29sb3IgPC0gYXMuaW50ZWdlcihEaWFtb25kcyRjb2xvcikNCkRpYW1vbmRzJGNsYXJp
dHkgPC0gYXMuaW50ZWdlcihEaWFtb25kcyRjbGFyaXR5KQ0KDQojI0ZpbmQgQ29ycmVsYXRpb24gYmV0d2VlbiB2YWx1ZXMNCmNvcihEaWFtb25kcykNCg0KIyM5IERlY2ltYWwgcGxhY2VzIGlzIG5vdCBpZGVhbCwgZHJvcCBpdCB0byAyLg0KY29yciA8LSByb3VuZChjb3IoRGlhbW9uZHMpLDIpDQoNCiMjU2hvdyBtZSB0aGUgZ3JhbS4NCmdnY29ycnBsb3QoY29yciwgbWV0aG9kID0gImNpcmNsZSIsICBsYWIgPSBUUlVFLCBoYy5vcmRlciA9IFRSVUUpDQoNCiMjI0Zyb20gdGhpcywgdGhlIGJlc3QgY29ycmVsYXRpb24gb2YgcHJpY2UgYXBwZWFycyB0byBiZSBjYXJhdCwgc28gdGhhdCBpcyB3aGF0IEknbGwgdXNlLiANCmBgYA0KDQoNCiNTcGxpdCB0aGUgZGF0YSBpbnRvIHRyYWluIGFuZCB0ZXN0IHNldHMNCmBgYHtyfQ0KIyNJbiBvcmRlciB0byBjaGVjayBpZiB0aGUgcHJlZGljdGlvbiB3b3Jrcywgd2UgbmVlZCB0byBzcGxpdCB0aGUgZGF0YSBpbnRvIHRyYWluaW5nIGRhdGEgYW5kIHRlc3QgZGF0YS4gSSdsbCBzcGxpdCBpdCBhYm91dCA3NS0yNSB0byBnaXZlIGEgZ29vZCBhbW91bnQgdG8gd29yayB3aXRoLg0KDQpzcGxpdCA8LSBzYW1wbGUuc3BsaXQoRGlhbW9uZHMsU3BsaXRSYXRpbyA9IDAuNzUpDQp0cmFpbiA8LSBzdWJzZXQoRGlhbW9uZHMsc3BsaXQ9PVRSVUUpDQp0ZXN0IDwtIHN1YnNldChEaWFtb25kcyxzcGxpdD09RkFMU0UpDQpgYGANCg0KDQojQW5hbHlzZSB0aGUgY3VydmUgdG8gYmVzdCBiZWdpbiB0aGUgbGluZWFyIHByZWRpY2l0b24gbW9kZWwNCmBgYHtyfQ0KcGxvdChEaWFtb25kcyRwcmljZSwgRGlhbW9uZHMkY2FyYXQpDQpsaW5lYXJNb2RlbCA8LSBsbShwcmljZSB+IGNhcmF0LCBkYXRhID0gdHJhaW4pDQpwbG90KGxpbmVhck1vZGVsKQ0KY3VydmUocHJlZGljdChsaW5lYXJNb2RlbCwgZGF0YS5mcmFtZShjYXJhdD14KSksIGx3ZCA9IDIsIGNvbCA9ICJkb2RnZXJibHVlIiwgIGFkZD1UKQ0KYGBgDQoNCiNQbG90IHNlY29uZGFyeSBncmFwaCB0byB2aXN1YWxpc2UgcHJlZGljdGlvbiB2cyBhY3R1YWwgdmFsdWUNCmBgYHtyfQ0KdGVzdCRwcmVkaWN0ZWQubGluZWFyIDwtIHByZWRpY3QobGluZWFyTW9kZWwsIHRlc3QpDQoNCnBsMSA8LSB0ZXN0ICU+JSANCiAgICBnZ3Bsb3QoYWVzKHByaWNlLHByZWRpY3RlZC5saW5lYXIpKSArIA0KICAgIGdlb21fcG9pbnQoYWxwaGE9MC41KSArIA0KICAgIHN0YXRfc21vb3RoKCkgKyANCiAgICB4bGFiKCdBY3R1YWwgdmFsdWUgb2YgZGlhbW9uZCcpICsgDQogICAgeWxhYignUHJlZGljdGVkIHZhbHVlIG9mIGRpYW1vbmQnKSArIA0KICAgIHRoZW1lX2J3KCkgDQoNCmdncGxvdGx5KHBsMSkNCmBgYA0KI0ZpbmQgRXJyb3IgYmFzZWQgb24gbGluZWFyIG1vZGVsDQpgYGB7cn0NCmVycm9yIDwtIHRlc3QkcHJpY2UtdGVzdCRwcmVkaWN0ZWQubGluZWFyIA0KICBybXNlIDwtIHNxcnQobWVhbihlcnJvcl4yKSkgDQogIHJtc2UgDQoNCmBgYA0KDQojUGxvdCBib3Ro
IG5vbi1saW5lYXIgYW5kIGxpbmVhciBtb2RlbHMuDQpgYGB7cn0NCmUgPC0gZXhwKDEpDQpsaW5lYXJNb2RlbCA8LSBsbShwcmljZSB+IGNhcmF0LCBkYXRhID0gdHJhaW4pDQoNCiNFeHRyYSBwYXJ0cyBrZXB0IGZvciBwb3N0ZXJpdHkuDQojI25vbkxpbmVhck1vZGVsIDwtIG5scyhwcmljZSB+IDMyNy43MDEzICsgODA0LjI4MjkqZV4oMS40NTAzNTYqY2FyYXQpLCB0cmFpbikNCiMjbm9uTGluZWFyTW9kZWwgPC0gVmVjdG9yaXplKG5scyhwcmljZSB+ICgzMjcuNzAxMyArIDgwNC4yODI5KmVeKCsxLjQ1MDM1Nip4KSksIHRyYWluLCBzdGFydCA9IGxpc3QoeCA9IGNhcmF0KSkpDQojI2N1cnZlKCgzMjcuNzAxMyArIDgwNC4yODI5KmVeKCsxLjQ1MDM1Nip4KSksbHdkPTMsIGNvbD0iZG9kZ2VyYmx1ZSIsIGFkZCA9IFQpDQoNCg0Kbm9uTGluZWFyTW9kZWwgPC0gbmxzKHByaWNlIH4gZV4oYipjYXJhdCkgKyBjLCB0cmFpbiwgc3RhcnQgPSBsaXN0KGIgPSAyLCBjID0gMCkpDQpwbG90KHRyYWluJGNhcmF0LCB0cmFpbiAkcHJpY2UsIGNvbD0iUmVkIiwgcGNoPTE2KSANCg0KY3VydmUocHJlZGljdChsaW5lYXJNb2RlbCwgZGF0YS5mcmFtZShjYXJhdD14KSksIGx3ZCA9IDIsIGNvbCA9ICJkb2RnZXJibHVlIiwgIGFkZD1UKQ0KY3VydmUocHJlZGljdChub25MaW5lYXJNb2RlbCwgZGF0YS5mcmFtZShjYXJhdD14KSksIGx3ZCA9IDMsIGNvbCA9ICJibGFjayIsICBhZGQ9VCkNCg0KYGBgDQoNCg0KYGBge3J9DQp0ZXN0JHByZWRpY3RlZC5ub25saW5lYXIgPC0gcHJlZGljdChub25MaW5lYXJNb2RlbCwgdGVzdCkNCg0KcGwxIDwtIHRlc3QgJT4lIA0KICAgIGdncGxvdChhZXMocHJpY2UscHJlZGljdGVkLm5vbmxpbmVhcikpICsgDQogICAgZ2VvbV9wb2ludChhbHBoYT0wLjUpICsgDQogICAgc3RhdF9zbW9vdGgoKSArIA0KICAgIHhsYWIoJ0FjdHVhbCB2YWx1ZSBvZiBwcmljZScpICsgDQogICAgeWxhYignUHJlZGljdGVkIHZhbHVlIG9mIHByaWNlJykgKyANCiAgICB0aGVtZV92b2lkKCkgDQoNCmdncGxvdGx5KHBsMSkNCmBgYA0KDQoNCkFnYWluIGxpa2Ugd2UgZGlkIHdpdGggdGhlIGxpbmVhciBtb2RlbCB3ZSBjYW4gY2FsY3VsYXRlIHRoZSBSTVNFLiBUaGVuIHdlIGNhbiBjb21wYXJlIHRoYXQgd2l0aCB0aGUgQU1TRSBmcm9tIHRoZSBsaW5lYXIgbW9kZWwgdG8gc2VlIGlmIGFueSBpbXByb3ZlbWVudHMgaGF2ZSBiZWVuIG1hZGUuIA0KDQogDQoNCmBgYHtyfSANCmVycm9yIDwtIHRlc3QkcHJpY2UtdGVzdCRwcmVkaWN0ZWQubm9ubGluZWFyIA0Kcm1zZU5MIDwtIHNxcnQobWVhbihlcnJvcl4yKSkgDQpybXNlTkwgDQpgYGAgDQpGcm9tIHRoaXMsIHdlIGNhbiBzZWUgdGhlIGxpbmVhciBtb2RlbCBpcyBtb3JlIGFjY3VyYXRlIHRoYW4gdGhlIG5vbi1saW5lYXIgb25lLiBJIHdvdWxkIGFyZ3VlIGhvd2V2ZXIgdGhhdCB0aGUgbm9uLWxpbmVhciBjdXN0b20gZXF1YXRpb24gc2hvdWxkIHRoZW9yZXRpY2FsbHkgYmUgbW9yZSBh
Y2N1cmF0ZSwgeWV0IEkgY2Fubm90IGZpZ3VyZSBvdXQgaG93IHRvIG1ha2UgdGhhdCB3b3JrLiBUaGF0IHdpbGwgYmUgYSBwb2ludCBmb3IgYSBmdXR1cmUgcHJvamVjdC4gVGhpcyBjYW4gdGhlcmVmb3JlIGJlIHVzZWQgb24gb3RoZXIgZGF0YSBzZXRzIG9mIHNpbWlsYXIgbWFraW5nIHRvIGF0dGVtcHQgdG8gZXN0aW1hdGUgRGlhbW9uZCBwcmljZXMgYmFzZWQgb24gQ2FyYXQuDQo=